# -*- coding: UTF-8 -*-
from requests_html import HTMLSession
import json
import re
# Module-level HTTP session; get_lyric() issues every request through it.
session = HTMLSession()

def get_lyric(id, timeout=10):
    """Fetch the lyric text lines of one song page on rapzh.com.

    Args:
        id: Song id interpolated into the page URL.  The name shadows the
            builtin ``id`` but is kept so existing keyword callers still work.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a single stalled connection would block the caller
            forever.

    Returns:
        The text lines of the lyric container, cut off at the first line equal
        to '专辑信息' ("album information") and with the first two lines
        dropped.

    Raises:
        IndexError: If the XPath expression matches no element on the page.
        Exception: Whatever the underlying HTTP session raises on network
            failure or timeout is propagated unchanged.
    """
    r = session.get(f'https://rapzh.com/songs/{id}', timeout=timeout)
    matches = r.html.xpath('//*[@id="__next"]/div[2]/div[2]/div[1]/div')
    text = matches[0].text

    lines = []
    for line in text.split("\n"):
        # Everything from the album-info heading onward is not lyric text.
        if line == '专辑信息':
            break
        lines.append(line)

    # Skip the first two lines (presumably a title/artist header -- verify
    # against the live page layout).
    return lines[2:]

# Crawl every song id, reduce each lyric line to its Chinese characters, and
# append the per-song de-duplicated lines to sentences.txt in batches.

# Matches single characters in the range U+4E00-U+9FA5 (CJK ideographs);
# compiled once instead of being re-specified for every lyric line.
_CHINESE_CHAR = re.compile('[\u4e00-\u9fa5]')


def _append_sentences(batch):
    """Append each sentence in *batch* to sentences.txt, one per line."""
    with open("sentences.txt", "a", encoding="UTF-8") as f:
        f.writelines(j + "\n" for j in batch)


sentences = []
for i in range(1, 604000):
    try:
        lines = get_lyric(i)
    except Exception:
        # Best-effort crawl: missing or malformed pages are skipped.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        lines = []

    # dict.fromkeys de-duplicates within one song while keeping lyric order,
    # so the output file is deterministic between runs.
    unique = dict.fromkeys(
        ''.join(_CHINESE_CHAR.findall(line)) for line in lines
    )
    # Lines without any Chinese character collapse to "" and are dropped.
    sentences.extend(text for text in unique if text)

    # The checkpoint runs even when this id failed, so a failed fetch on a
    # multiple of 1000 no longer skips the flush.
    if i % 1000 == 0:
        print(i, len(sentences))
        _append_sentences(sentences)
        sentences = []

# Flush whatever was collected after the last checkpoint; without this the
# sentences from the final partial batch would never reach the file.
if sentences:
    _append_sentences(sentences)
    sentences = []
